{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bff4d0e6",
   "metadata": {},
   "source": [
    "# AutoMQ Table Topic Demonstration\n",
    "\n",
    "This notebook demonstrates the core capabilities of AutoMQ Table Topic, including automatic table creation, Upsert mode for data synchronization, and data partitioning. The workflow creates a topic with Upsert and partitioning enabled, sends one Insert (I), Update (U), and Delete (D) message, and queries the Iceberg table after each operation."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9aa48324",
   "metadata": {},
   "source": [
    "## 1. Import Libraries and Define Helper Functions\n",
    "\n",
    "Import necessary libraries and define helper functions for key operations such as creating topics, producing messages, and querying Iceberg tables."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "345b7003",
   "metadata": {},
   "outputs": [],
   "source": [
    "import uuid\n",
    "from confluent_kafka import Producer\n",
    "from confluent_kafka.serialization import StringSerializer, SerializationContext, MessageField\n",
    "from confluent_kafka.schema_registry import SchemaRegistryClient\n",
    "from confluent_kafka.schema_registry.avro import AvroSerializer\n",
    "from confluent_kafka.admin import AdminClient, NewTopic\n",
    "from confluent_kafka.cimpl import KafkaException, KafkaError\n",
    "from datetime import datetime, timezone\n",
    "from faker import Faker\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.utils import AnalysisException\n",
    "\n",
    "# Configuration constants\n",
    "KAFKA_BOOTSTRAP_SERVERS = 'automq:9092'\n",
    "SCHEMA_REGISTRY_URL = 'http://schema-registry:8081'\n",
    "TOPIC_NAME = 'web_page_view_events'\n",
    "\n",
    "# Initialize AdminClient and SchemaRegistryClient\n",
    "admin_client_conf = {'bootstrap.servers': KAFKA_BOOTSTRAP_SERVERS}\n",
    "admin_client = AdminClient(admin_client_conf)\n",
    "schema_registry_conf = {'url': SCHEMA_REGISTRY_URL}\n",
    "schema_registry_client = SchemaRegistryClient(schema_registry_conf)\n",
    "\n",
    "# Initialize SparkSession\n",
    "spark = SparkSession.builder.appName(\"AutoMQ Table Topic Demo\").getOrCreate()\n",
    "fake = Faker()\n",
    "\n",
    "# Helper function: Create a Kafka Topic\n",
    "def create_topic(topic_name, num_partitions=1, replication_factor=1, config=None):\n",
    "    if config is None:\n",
    "        config = {}\n",
    "    topics = [NewTopic(topic_name, num_partitions=num_partitions, replication_factor=replication_factor, config=config)]\n",
    "    futures = admin_client.create_topics(topics, operation_timeout=30)\n",
    "    for topic, future in futures.items():\n",
    "        try:\n",
    "            future.result()\n",
    "            print(f\"Topic '{topic}' created successfully.\")\n",
    "        except KafkaException as e:\n",
    "            error = e.args[0]\n",
    "            if error.code() == KafkaError.TOPIC_ALREADY_EXISTS:\n",
    "                print(f\"Topic '{topic}' already exists.\")\n",
    "            else:\n",
    "                raise Exception(f\"Failed to create topic '{topic}': {error.str()}\")\n",
    "\n",
    "# Helper function: Create a Producer\n",
    "def create_producer():\n",
    "    producer_conf = {'bootstrap.servers': KAFKA_BOOTSTRAP_SERVERS}\n",
    "    return Producer(producer_conf)\n",
    "\n",
    "# Helper function: Produce events to Kafka\n",
    "def produce_events(producer, topic_name, events_data, avro_serializer, string_serializer):\n",
    "    for event in events_data:\n",
    "        try:\n",
    "            producer.produce(\n",
    "                topic=topic_name,\n",
    "                key=string_serializer(event.event_id),\n",
    "                value=avro_serializer(event, SerializationContext(topic_name, MessageField.VALUE)),\n",
    "                on_delivery=delivery_report\n",
    "            )\n",
    "        except Exception as e:\n",
    "            print(f\"Failed to produce event {event.event_id}: {e}\")\n",
    "        producer.poll(0)\n",
    "    producer.flush()\n",
    "    print(f\"Successfully produced {len(events_data)} event(s) to {topic_name}.\")\n",
    "\n",
    "# Delivery report callback for produced messages\n",
    "def delivery_report(err, msg):\n",
    "    if err is not None:\n",
    "        print(f\"Message delivery failed: {err}\")\n",
    "        return\n",
    "    print(f\"Message delivered to {msg.topic()} [partition {msg.partition()}] at offset {msg.offset()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6dca093b",
   "metadata": {},
   "source": [
    "## 2. Create Topic with Upsert and Partitioning\n",
    "\n",
    "Create a Kafka topic with Table Topic enabled, configured for Upsert mode and partitioning. The topic uses `event_id` as the primary key, `ops` as the operation field, and partitions data by `bucket(page_url, 5)` and `hour(timestamp)`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0aa7bb63",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define Avro Schema with operation support\n",
    "schema_str = \"\"\"\n",
    "{\n",
    "  \"type\": \"record\",\n",
    "  \"name\": \"PageViewEvent\",\n",
    "  \"namespace\": \"com.example.events\",\n",
    "  \"fields\": [\n",
    "    {\"name\": \"event_id\", \"type\": \"string\"},\n",
    "    {\"name\": \"user_id\", \"type\": \"string\"},\n",
    "    {\"name\": \"timestamp\", \"type\": { \"type\": \"long\", \"logicalType\": \"timestamp-millis\" }},\n",
    "    {\"name\": \"page_url\", \"type\": \"string\"},\n",
    "    {\"name\": \"ip_address\", \"type\": \"string\"},\n",
    "    {\"name\": \"user_agent\", \"type\": \"string\"},\n",
    "    {\"name\": \"ops\", \"type\": \"string\"}\n",
    "  ]\n",
    "}\n",
    "\"\"\"\n",
    "\n",
    "# Define PageViewEvent class\n",
    "class PageViewEvent:\n",
    "    def __init__(self, event_id, user_id, timestamp, page_url, ip_address, user_agent, ops):\n",
    "        self.event_id = event_id\n",
    "        self.user_id = user_id\n",
    "        self.timestamp = timestamp\n",
    "        self.page_url = page_url\n",
    "        self.ip_address = ip_address\n",
    "        self.user_agent = user_agent\n",
    "        self.ops = ops\n",
    "\n",
    "# Serialization function for events\n",
    "def event_to_dict(event, ctx):\n",
    "    return {\n",
    "        \"event_id\": event.event_id,\n",
    "        \"user_id\": event.user_id,\n",
    "        \"timestamp\": event.timestamp,\n",
    "        \"page_url\": event.page_url,\n",
    "        \"ip_address\": event.ip_address,\n",
    "        \"user_agent\": event.user_agent,\n",
    "        \"ops\": event.ops\n",
    "    }\n",
    "\n",
    "# Create topic with Upsert and partitioning configurations\n",
    "topic_config = {\n",
    "    'automq.table.topic.enable': 'true',\n",
    "    'automq.table.topic.commit.interval.ms': '2000',\n",
    "    'automq.table.topic.schema.type': 'schema',\n",
    "    'automq.table.topic.upsert.enable': 'true',\n",
    "    'automq.table.topic.id.columns': '[event_id]',\n",
    "    'automq.table.topic.cdc.field': 'ops',\n",
    "    'automq.table.topic.partition.by': '[bucket(page_url, 5), hour(timestamp)]'\n",
    "}\n",
    "create_topic(TOPIC_NAME, config=topic_config)\n",
    "\n",
    "# Initialize serializers and producer\n",
    "avro_serializer = AvroSerializer(schema_registry_client, schema_str, event_to_dict)\n",
    "string_serializer = StringSerializer('utf_8')\n",
    "producer = create_producer()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "11c949ad",
   "metadata": {},
   "source": [
    "## 3. Insert Operation\n",
    "\n",
    "Produce an Insert (I) event to the topic."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2abe64e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "event_id = str(uuid.uuid4())\n",
    "current_timestamp = int(datetime.now(timezone.utc).timestamp() * 1000)\n",
    "insert_event = [PageViewEvent(event_id, fake.user_name(), current_timestamp, fake.uri_path(), fake.ipv4(), fake.user_agent(), \"I\")]\n",
    "produce_events(producer, TOPIC_NAME, insert_event, avro_serializer, string_serializer)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "70a9f918",
   "metadata": {},
   "source": [
    "## 4. Query After Insert\n",
    "\n",
    "Query the Iceberg table to verify the inserted record."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ebd1757",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = spark.read.format(\"iceberg\").load(f\"default.{TOPIC_NAME}\")\n",
    "df.show()\n",
    "\n",
    "spark.sql(f\"SELECT file_path FROM default.{TOPIC_NAME}.files\").show(vertical=True, truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "98922933",
   "metadata": {},
   "source": [
    "## 5. Update Operation\n",
    "\n",
    "Produce an Update (U) event for the same `event_id` to update the record."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c22f0df8",
   "metadata": {},
   "outputs": [],
   "source": [
    "update_event = [PageViewEvent(event_id, fake.user_name(), current_timestamp + 1000, fake.uri_path(), fake.ipv4(), fake.user_agent(), \"U\")]\n",
    "produce_events(producer, TOPIC_NAME, update_event, avro_serializer, string_serializer)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "518d50a9",
   "metadata": {},
   "source": [
    "## 6. Query After Update\n",
    "\n",
    "Query the Iceberg table to verify the updated record."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f16244a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = spark.read.format(\"iceberg\").load(f\"default.{TOPIC_NAME}\")\n",
    "df.show()\n",
    "\n",
    "spark.sql(f\"SELECT file_path FROM default.{TOPIC_NAME}.files\").show(vertical=True, truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f172213",
   "metadata": {},
   "source": [
    "## 7. Delete Operation\n",
    "\n",
    "Produce a Delete (D) event for the same `event_id` to remove the record."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03b8eb79",
   "metadata": {},
   "outputs": [],
   "source": [
    "delete_event = [PageViewEvent(event_id, fake.user_name(), current_timestamp + 2000, fake.uri_path(), fake.ipv4(), fake.user_agent(), \"D\")]\n",
    "produce_events(producer, TOPIC_NAME, delete_event, avro_serializer, string_serializer)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "89b62054",
   "metadata": {},
   "source": [
    "## 8. Query After Delete\n",
    "\n",
    "Query the Iceberg table to verify that the record has been removed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b221c2c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = spark.read.format(\"iceberg\").load(f\"default.{TOPIC_NAME}\")\n",
    "df.show()\n",
    "\n",
    "spark.sql(f\"SELECT file_path FROM default.{TOPIC_NAME}.files\").show(vertical=True, truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff13c16e",
   "metadata": {},
   "source": [
    "## 9. Cleanup\n",
    "\n",
    "Delete the topic and drop the Iceberg table after the demonstration."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15679a45",
   "metadata": {},
   "outputs": [],
   "source": [
    "admin_client.delete_topics([TOPIC_NAME])\n",
    "spark.sql(f\"DROP TABLE default.{TOPIC_NAME}\")\n",
    "print(f\"Topic '{TOPIC_NAME}' and Iceberg table deleted.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
